{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e551fbf2-96df-4964-8aec-3edfd45e2d69",
   "metadata": {},
   "source": [
    "Q1 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e4a53e3a-97a3-4c60-ac67-c3e8e2807d6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "61cbdb0c-0c30-4ae1-9d81-228f81fcb90f",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_question = \"I just discovered the course. Can I still join it?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4df54232-782f-4af5-b501-a1295396ad45",
   "metadata": {},
   "outputs": [],
   "source": [
    "v = embedding_model.encode(user_question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4e55ff51-6df4-406a-9024-5798efe511fa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.07822262"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "3dfbcba7-ab97-4042-b6ad-5ccd249954d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "768"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(v)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12c50bcd-4b32-48f0-82fd-1ed093205dd5",
   "metadata": {},
   "source": [
    "Q2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "81ea965c-5e20-4009-9e32-04bcd9bc34a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests \n",
    "\n",
    "base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'\n",
    "relative_url = '03-vector-search/eval/documents-with-ids.json'\n",
    "docs_url = f'{base_url}/{relative_url}?raw=1'\n",
    "docs_response = requests.get(docs_url)\n",
    "documents = docs_response.json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e9600ffd-0798-4e22-bd1a-e48f77468360",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = [d for d in documents if d['course'] == \"machine-learning-zoomcamp\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8b51fe26-bf37-4509-a64b-78ac4f693927",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "375"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "100d4859-c141-4bcb-a709-3ef4b29f2ff9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "d9f90998-f699-41c6-821c-40e1fb41525d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e478da599a5042f9b0b07ae82f39129a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/375 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "embeddings = []\n",
    "\n",
    "for d in tqdm(documents):\n",
    "    qa_text = '{question} {text}'.format(**d)\n",
    "    v_qa = embedding_model.encode(qa_text)\n",
    "    embeddings.append(v_qa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "f6eef854-e5e6-4197-b325-ca891d27409d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "7d7b0a91-518b-480d-aaf7-9f252179ed27",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = np.array(embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "73609f26-b26a-44ff-b321-b847a9a31043",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(375, 768)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "86684b01-4659-4269-9aef-7e5f4d6ef66f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.08805912,  0.01559365,  0.07925579, ...,  0.00745123,\n",
       "         0.00241912,  0.01147128],\n",
       "       [ 0.07066917, -0.01930564,  0.07738367, ...,  0.04495566,\n",
       "         0.01078552, -0.02316587],\n",
       "       [ 0.10229155, -0.0166324 ,  0.03414484, ...,  0.03215319,\n",
       "        -0.04317445, -0.04574377],\n",
       "       ...,\n",
       "       [-0.00768201,  0.01075665,  0.01190489, ...,  0.06066166,\n",
       "        -0.03393208,  0.01605328],\n",
       "       [ 0.13408284, -0.03755201,  0.0197653 , ...,  0.02163629,\n",
       "        -0.01921146,  0.03690196],\n",
       "       [ 0.0438601 , -0.0130077 ,  0.06373127, ...,  0.03338453,\n",
       "         0.02258542, -0.06305875]], dtype=float32)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c96be77-4ef1-48f0-b13d-feecb3fe7fa2",
   "metadata": {},
   "source": [
    "Q3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "45383696-31e6-4b40-a416-bdfe5b5113d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = X.dot(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3eeb1745-3dc8-48ee-b126-015f809ddb76",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6506575"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "9dafd113-67b4-406b-baff-3e0b7ec4548e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores.argmax()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "612cc0c0-f673-40f7-8ec9-7ab11b59bad2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',\n",
       " 'section': 'General course-related questions',\n",
       " 'question': 'The course has already started. Can I still join it?',\n",
       " 'course': 'machine-learning-zoomcamp',\n",
       " 'id': 'ee58a693'}"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents[14]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1f1ac87-67d7-41ff-b808-f870c3206de0",
   "metadata": {},
   "source": [
    "Vector search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "97014767-e8ac-4994-a33d-3d3c15becf70",
   "metadata": {},
   "outputs": [],
   "source": [
    "idx = (-scores).argsort()[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "1168a432-c7ad-4db0-864a-13ad0f2cc009",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'section': 'General course-related questions', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp', 'id': 'ee58a693'}\n",
      "{'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\\nOr you can just use this link: http://mlzoomcamp.com/#syllabus', 'section': 'General course-related questions', 'question': 'I just joined. What should I do next? How can I access course materials?', 'course': 'machine-learning-zoomcamp', 'id': '0a278fb2'}\n",
      "{'text': \"The process is automated now, so you should receive the email eventually. If you haven’t, check your promotions tab in Gmail as well as spam.\\nIf you unsubscribed from our newsletter, you won't get course related updates too.\\nBut don't worry, it’s not a problem. To make sure you don’t miss anything, join the #course-ml-zoomcamp channel in Slack and our telegram channel with announcements. This is enough to follow the course.\", 'section': 'General course-related questions', 'question': \"I filled the form, but haven't received a confirmation email. Is it normal?\", 'course': 'machine-learning-zoomcamp', 'id': '6ba259b1'}\n",
      "{'text': 'Technically, yes. Advisable? Not really. Reasons:\\nSome homework(s) asks for specific python library versions.\\nAnswers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)\\nAnd as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be penalized for others not knowing these other languages?\\nYou can create a separate repo using course’s lessons but written in other languages for your own learnings, but not advisable for submissions.\\ntx[source]', 'section': 'Miscellaneous', 'question': 'Can I do the course in other languages, like R or Scala?', 'course': 'machine-learning-zoomcamp', 'id': '9f261648'}\n",
      "{'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the same, and the videos are still up-to-date.\\nIf you haven’t taken part in the previous iteration, you can start watching the videos. It’ll be useful for you and you will learn new things. However, we recommend using Python 3.10 now instead of Python 3.8.', 'section': 'General course-related questions', 'question': 'The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 2021?', 'course': 'machine-learning-zoomcamp', 'id': 'e7ba6b8a'}\n"
     ]
    }
   ],
   "source": [
    "for i in idx:\n",
    "    print(documents[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "5f372a5b-b0d0-4612-aec3-0a5039f7d83d",
   "metadata": {},
   "outputs": [],
   "source": [
    "class VectorSearchEngine():\n",
    "    def __init__(self, documents, embeddings):\n",
    "        self.documents = documents\n",
    "        self.embeddings = embeddings\n",
    "\n",
    "    def search(self, v_query, num_results=10):\n",
    "        scores = self.embeddings.dot(v_query)\n",
    "        idx = np.argsort(-scores)[:num_results]\n",
    "        return [self.documents[i] for i in idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "e533fcff-1022-4408-970d-542e3744b15e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'The course has already started. Can I still join it?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': 'ee58a693'},\n",
       " {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\\nOr you can just use this link: http://mlzoomcamp.com/#syllabus',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'I just joined. What should I do next? How can I access course materials?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': '0a278fb2'},\n",
       " {'text': \"The process is automated now, so you should receive the email eventually. If you haven’t, check your promotions tab in Gmail as well as spam.\\nIf you unsubscribed from our newsletter, you won't get course related updates too.\\nBut don't worry, it’s not a problem. To make sure you don’t miss anything, join the #course-ml-zoomcamp channel in Slack and our telegram channel with announcements. This is enough to follow the course.\",\n",
       "  'section': 'General course-related questions',\n",
       "  'question': \"I filled the form, but haven't received a confirmation email. Is it normal?\",\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': '6ba259b1'},\n",
       " {'text': 'Technically, yes. Advisable? Not really. Reasons:\\nSome homework(s) asks for specific python library versions.\\nAnswers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)\\nAnd as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be penalized for others not knowing these other languages?\\nYou can create a separate repo using course’s lessons but written in other languages for your own learnings, but not advisable for submissions.\\ntx[source]',\n",
       "  'section': 'Miscellaneous',\n",
       "  'question': 'Can I do the course in other languages, like R or Scala?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': '9f261648'},\n",
       " {'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the same, and the videos are still up-to-date.\\nIf you haven’t taken part in the previous iteration, you can start watching the videos. It’ll be useful for you and you will learn new things. However, we recommend using Python 3.10 now instead of Python 3.8.',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 2021?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': 'e7ba6b8a'}]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "search_engine = VectorSearchEngine(documents=documents, embeddings=X)\n",
    "search_engine.search(v, num_results=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b86f484-5273-4a43-9513-47482b281698",
   "metadata": {},
   "source": [
    "Q4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "3e349117-1b81-40de-ba14-46982c6cb8e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'\n",
    "relative_url = '03-vector-search/eval/ground-truth-data.csv'\n",
    "ground_truth_url = f'{base_url}/{relative_url}?raw=1'\n",
    "\n",
    "df_ground_truth = pd.read_csv(ground_truth_url)\n",
    "df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']\n",
    "ground_truth = df_ground_truth.to_dict(orient='records')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "1bc9db20-2a45-4bbd-9dfd-2a2050809853",
   "metadata": {},
   "outputs": [],
   "source": [
    "def hit_rate(relevance_total):\n",
    "    cnt = 0\n",
    "\n",
    "    for line in relevance_total:\n",
    "        if True in line:\n",
    "            cnt = cnt + 1\n",
    "\n",
    "    return cnt / len(relevance_total)\n",
    "\n",
    "\n",
    "def mrr(relevance_total):\n",
    "    total_score = 0.0\n",
    "\n",
    "    for line in relevance_total:\n",
    "        for rank in range(len(line)):\n",
    "            if line[rank] == True:\n",
    "                total_score = total_score + 1 / (rank + 1)\n",
    "\n",
    "    return total_score / len(relevance_total)\n",
    "\n",
    "\n",
    "def evaluate(ground_truth, search_function):\n",
    "    relevance_total = []\n",
    "\n",
    "    for q in tqdm(ground_truth):\n",
    "        doc_id = q['document']\n",
    "        results = search_function(q)\n",
    "        relevance = [d['id'] == doc_id for d in results]\n",
    "        relevance_total.append(relevance)\n",
    "\n",
    "    return {\n",
    "        'hit_rate': hit_rate(relevance_total),\n",
    "        'mrr': mrr(relevance_total),\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "882fce3e-67bb-4129-9396-001c8c765583",
   "metadata": {},
   "outputs": [],
   "source": [
    "def numpy_cosine_search(q):\n",
    "    question = q['question']\n",
    "\n",
    "    v_q = embedding_model.encode(question)\n",
    "\n",
    "    return search_engine.search(v_q, num_results=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "c0152bfd-ac03-4bf1-8694-884556406775",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'What if I miss a session?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': '5170565b'},\n",
       " {'text': 'The course videos are pre-recorded, you can start watching the course right now.\\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'Is it going to be live? When?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': '39fda9f0'},\n",
       " {'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the same, and the videos are still up-to-date.\\nIf you haven’t taken part in the previous iteration, you can start watching the videos. It’ll be useful for you and you will learn new things. However, we recommend using Python 3.10 now instead of Python 3.8.',\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 2021?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': 'e7ba6b8a'},\n",
       " {'text': \"Yes, it's possible. See the previous answer.\",\n",
       "  'section': 'General course-related questions',\n",
       "  'question': 'Will I get a certificate if I missed the midterm project?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': '1d644223'},\n",
       " {'text': 'Answer: All midterms and capstones are meant to be solo projects. [source @Alexey]',\n",
       "  'section': 'Projects (Midterm and Capstone)',\n",
       "  'question': 'Are projects solo or collaborative/group work?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'id': '4dfb5d4f'}]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "numpy_cosine_search(ground_truth[10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "bd494ea9-f8e0-4f68-a77b-8077dc02fe62",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "807c98a981b14bcc814c9bf7a8cd25df",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1830 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluate(ground_truth, numpy_cosine_search)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f507aae7-5d30-4f84-b3c8-6a694da48d90",
   "metadata": {},
   "source": [
    "Q5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "2911c71e-dbe0-4c37-9d0f-00c27f1cb5c9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-homework'})"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "\n",
    "es_client = Elasticsearch('http://localhost:9200') \n",
    "\n",
    "index_settings = {\n",
    "    \"settings\": {\n",
    "        \"number_of_shards\": 1,\n",
    "        \"number_of_replicas\": 0\n",
    "    },\n",
    "    \"mappings\": {\n",
    "        \"properties\": {\n",
    "            \"text\": {\"type\": \"text\"},\n",
    "            \"section\": {\"type\": \"text\"},\n",
    "            \"question\": {\"type\": \"text\"},\n",
    "            \"course\": {\"type\": \"keyword\"},\n",
    "            \"id\": {\"type\": \"keyword\"},\n",
    "            \"question_text_vector\": {\n",
    "                \"type\": \"dense_vector\",\n",
    "                \"dims\": 768,\n",
    "                \"index\": True,\n",
    "                \"similarity\": \"cosine\"\n",
    "            },\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "index_name = \"course-questions-homework\"\n",
    "\n",
    "es_client.indices.delete(index=index_name, ignore_unavailable=True)\n",
    "es_client.indices.create(index=index_name, body=index_settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "0f435399-4476-4859-9059-91fdee16548e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4269fab50835482db163e45adb22e957",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/375 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for d, emb in zip(tqdm(documents), embeddings):\n",
    "    d['question_text_vector'] = emb\n",
    "    es_client.index(index=index_name, document=d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "c183ac23-4a68-4778-b24a-444265143928",
   "metadata": {},
   "outputs": [],
   "source": [
    "def elastic_search_knn(field, vector, course):\n",
    "    knn = {\n",
    "        \"field\": field,\n",
    "        \"query_vector\": vector,\n",
    "        \"k\": 5,\n",
    "        \"num_candidates\": 10000,\n",
    "        \"filter\": {\n",
    "            \"term\": {\n",
    "                \"course\": course\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "\n",
    "    search_query = {\n",
    "        \"knn\": knn,\n",
    "        \"_source\": [\"text\", \"section\", \"question\", \"course\", \"id\"]\n",
    "    }\n",
    "\n",
    "    es_results = es_client.search(\n",
    "        index=index_name,\n",
    "        body=search_query\n",
    "    )\n",
    "    \n",
    "    result_docs = []\n",
    "    \n",
    "    for hit in es_results['hits']['hits']:\n",
    "        result_docs.append(hit['_source'])\n",
    "\n",
    "    return result_docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "0375e52f-25f5-4382-86b7-6ba163f9a408",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'question': 'The course has already started. Can I still join it?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'General course-related questions',\n",
       "  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',\n",
       "  'id': 'ee58a693'},\n",
       " {'question': 'I just joined. What should I do next? How can I access course materials?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'General course-related questions',\n",
       "  'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\\nOr you can just use this link: http://mlzoomcamp.com/#syllabus',\n",
       "  'id': '0a278fb2'},\n",
       " {'question': \"I filled the form, but haven't received a confirmation email. Is it normal?\",\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'General course-related questions',\n",
       "  'text': \"The process is automated now, so you should receive the email eventually. If you haven’t, check your promotions tab in Gmail as well as spam.\\nIf you unsubscribed from our newsletter, you won't get course related updates too.\\nBut don't worry, it’s not a problem. To make sure you don’t miss anything, join the #course-ml-zoomcamp channel in Slack and our telegram channel with announcements. This is enough to follow the course.\",\n",
       "  'id': '6ba259b1'},\n",
       " {'question': 'Can I do the course in other languages, like R or Scala?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'Miscellaneous',\n",
       "  'text': 'Technically, yes. Advisable? Not really. Reasons:\\nSome homework(s) asks for specific python library versions.\\nAnswers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)\\nAnd as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be penalized for others not knowing these other languages?\\nYou can create a separate repo using course’s lessons but written in other languages for your own learnings, but not advisable for submissions.\\ntx[source]',\n",
       "  'id': '9f261648'},\n",
       " {'question': 'The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 2021?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'General course-related questions',\n",
       "  'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the same, and the videos are still up-to-date.\\nIf you haven’t taken part in the previous iteration, you can start watching the videos. It’ll be useful for you and you will learn new things. However, we recommend using Python 3.10 now instead of Python 3.8.',\n",
       "  'id': 'e7ba6b8a'}]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "elastic_search_knn('question_text_vector', v, 'machine-learning-zoomcamp')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68b7d280-7701-4645-9872-b3fef48210a3",
   "metadata": {},
   "source": [
    "Q6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "2ee34fb1-d54d-4a97-ab85-8425156d77c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def question_vector_knn(q):\n",
    "    question = q['question']\n",
    "    course = 'machine-learning-zoomcamp'\n",
    "\n",
    "    v_q = embedding_model.encode(question)\n",
    "\n",
    "    return elastic_search_knn('question_text_vector', v_q, course)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "313a0252-2638-47ce-ba7d-789d4823e1d0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'question': 'What if I miss a session?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'General course-related questions',\n",
       "  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',\n",
       "  'id': '5170565b'},\n",
       " {'question': 'Is it going to be live? When?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'General course-related questions',\n",
       "  'text': 'The course videos are pre-recorded, you can start watching the course right now.\\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',\n",
       "  'id': '39fda9f0'},\n",
       " {'question': 'The course videos are from the previous iteration. Will you release new ones or we’ll use the videos from 2021?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'General course-related questions',\n",
       "  'text': 'We won’t re-record the course videos. The focus of the course and the skills we want to teach remained the same, and the videos are still up-to-date.\\nIf you haven’t taken part in the previous iteration, you can start watching the videos. It’ll be useful for you and you will learn new things. However, we recommend using Python 3.10 now instead of Python 3.8.',\n",
       "  'id': 'e7ba6b8a'},\n",
       " {'question': 'Will I get a certificate if I missed the midterm project?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'General course-related questions',\n",
       "  'text': \"Yes, it's possible. See the previous answer.\",\n",
       "  'id': '1d644223'},\n",
       " {'question': 'Are projects solo or collaborative/group work?',\n",
       "  'course': 'machine-learning-zoomcamp',\n",
       "  'section': 'Projects (Midterm and Capstone)',\n",
       "  'text': 'Answer: All midterms and capstones are meant to be solo projects. [source @Alexey]',\n",
       "  'id': '4dfb5d4f'}]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "question_vector_knn(ground_truth[10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "94ddddae-1591-4bb5-92a9-3c79d95f43e9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "81fa0a08d4254d76b1bd29969b393b08",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1830 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluate(ground_truth, question_vector_knn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "8e69cde8-2c33-492d-b318-b7f66cb317e3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "375"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1b1c153-197b-457b-bc01-c3b52ef9fd7c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
